Citibike ride patterns are significantly influenced by different factors across different seasons
Problem definition, background and brief literature review. Justify your choice of data and provide sources
# Loading necessary libraries
library(sf)
library(tmap)
library(tmaptools)
library(dplyr)
library(lubridate)
library(ggplot2)
library(gstat)
library(sp)
Source: https://www.nyc.gov/site/planning/planning-level/nyc-population/2020-census.page#2020-census-results
# Read the NYC 2020 census tract shapefile
census <- st_read(dsn="Assignment/Data/nyc2020_census/nyct2020.shp", layer="nyct2020")
## Reading layer `nyct2020' from data source
## `/Users/xiaoweilim/Desktop/CEGE0097_SAG/Assignment/Data/nyc2020_census/nyct2020.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2325 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913175.1 ymin: 120128.4 xmax: 1067383 ymax: 272844.3
## Projected CRS: NAD83 / New York Long Island (ftUS)
# Load the population csv data
population <- read.csv("Assignment/Data/nyc_censusdata_2020.csv")
# Cast the tract code to character so the join key types match BoroCT2020
# (presumably read in as numeric by read.csv -- confirm against the raw csv)
population$BCT2020 <- as.character(population$BCT2020)
# Join population data to census data; left join keeps every census tract,
# even those without a matching population record
census_joined <- census %>%
left_join(population, by= c("BoroCT2020"="BCT2020"))
# Plot census polygons coloured by tract code as a quick visual sanity check
plot(census_joined["CT2020"])
Source: https://citibikenyc.com/system-data
# Load Citibike trip-level data for January 2019 (one row per ride)
citibike_jan <- read.csv("Assignment/Data/2019-citibike-tripdata/1_January/201901-citibike-tripdata_1.csv")
# If there are multiple csv files (e.g. summer months = more rides),
# read each part and bind the rows, as in this July example:
# citibike_1 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_1.csv")
# citibike_2 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_2.csv")
# citibike_3 <- read.csv("Assignment/Data/2019-citibike-tripdata/7_July/201907-citibike-tripdata_3.csv")
# citibike_july <- bind_rows(citibike_1, citibike_2, citibike_3)
Data Wrangling: Cleaning up the Date/Time formatting
# Convert starttime and stoptime strings into POSIXct DateTime format
citibike_jan$starttime <- ymd_hms(citibike_jan$starttime)
citibike_jan$stoptime <- ymd_hms(citibike_jan$stoptime)
# Derive weekday/weekend flag and start hour for later time-of-day aggregation
citibike_jan <- citibike_jan %>%
mutate(
# TRUE when the ride starts on a weekend. wday() returns 1 = Sunday and
# 7 = Saturday; %in% already yields a logical vector, so the previous
# ifelse(..., TRUE, FALSE) wrapper was redundant and has been removed.
is_weekend = wday(starttime) %in% c(1, 7),
# Hour of day (0-23) at which the ride started
start_hour = hour(starttime)
)
Data Wrangling: Station-Oriented Dataset
The Citibike dataset is currently organised transactionally, with each ride representing one transaction. The raw data is transformed into a station-oriented dataset, with attributes such as station id/lat/long, ride counts, and user counts. Additional attributes are computed, such as: i. the time of day and day type (weekday or weekend) at which rides start from the station; ii. End Ride Count, which sums the number of bikes returned/docked at this particular station; iii. Median Trip Duration for rides originating from, or ending at, the station.
# Create the station dataset with attributes on station id/lat/long, ride
# counts, user counts and time-based counts (one row per start station)
start_station_jan <- citibike_jan %>%
# Group by start station; coordinates are included in the key so they are
# carried through the summarise
group_by(start.station.id, start.station.latitude, start.station.longitude) %>%
summarise(
# Total rides originating at this station
ride_start_count = n(),
# Ride counts split by membership type
usertype_subscriber_count = sum(usertype == "Subscriber"),
usertype_customer_count = sum(usertype == "Customer"),
# Weekday counts for time intervals AM/PM Peak/Off-Peak - ref Liu et al
ride_start_weekday_7_10 = sum(!is_weekend & start_hour>=7 & start_hour<10),
ride_start_weekday_10_17 = sum(!is_weekend & start_hour>=10 & start_hour<17),
ride_start_weekday_17_0 = sum(!is_weekend & start_hour>=17 & start_hour<24),
ride_start_weekday_0_7 = sum(!is_weekend & start_hour>=0 & start_hour<7),
# Weekend counts for leisure/others - ref Liu et al
ride_start_weekend_10_0 = sum(is_weekend & start_hour>=10 & start_hour<24),
ride_start_weekend_0_10 = sum(is_weekend & start_hour>=0 & start_hour<10)
) %>%
# Neutral column names so start and end summaries can be joined later
rename(
station_id = start.station.id,
station_latitude = start.station.latitude,
station_longitude = start.station.longitude
)
# Calculate end station counts (bikes returned/docked at each station)
end_station_jan <- citibike_jan %>%
group_by(end.station.id, end.station.latitude, end.station.longitude) %>%
summarise(
ride_end_count = n()
) %>%
# Rename to the shared station_* column names used for the join below
rename(
station_id = end.station.id,
station_latitude = end.station.latitude,
station_longitude = end.station.longitude
)
# Combine start and end summaries to get total ride activity by station;
# full_join keeps stations that only appear as an origin or only as a destination
station_jan <- full_join(start_station_jan, end_station_jan,
by= c("station_id", "station_latitude", "station_longitude")) %>%
# Stations missing from one side of the join get NA counts; treat as zero
mutate(
ride_start_count = ifelse(is.na(ride_start_count), 0, ride_start_count),
ride_end_count = ifelse(is.na(ride_end_count), 0, ride_end_count),
ride_activity = ride_start_count + ride_end_count
)
# Median trip duration per start station
# (tripduration units are as published by Citibike -- seconds, TODO confirm)
start_station_duration <- citibike_jan %>%
group_by(start.station.id) %>%
summarise(median_trip_duration_start = median(tripduration, na.rm=TRUE)) %>%
rename(station_id = start.station.id)
# Median trip duration per end station
end_station_duration <- citibike_jan %>%
group_by(end.station.id) %>%
summarise(median_trip_duration_end = median(tripduration, na.rm=TRUE)) %>%
rename(station_id = end.station.id)
# Merge median trip durations onto station_jan (left joins keep all stations)
station_jan <- station_jan %>%
left_join(start_station_duration, by= "station_id") %>%
left_join(end_station_duration, by= "station_id")
Plotting the aggregated Citibike Station data
# Convert the cleaned January station dataset into an sf point object (WGS84);
# remove=FALSE keeps the lon/lat columns alongside the geometry
station_jan_sf <- station_jan %>%
st_as_sf(coords = c("station_longitude", "station_latitude"), crs=4326, remove=FALSE)
# Plot map of station points; bubble colour encodes total ride activity
tm_shape(station_jan_sf) +
tm_bubbles(size=0.1, col="ride_activity", palette="YlOrRd") +
tm_layout(main.title = "Citibike Jan 2019",
main.title.size=1.5, frame=FALSE, legend.position = c("left", "top"))
Combining the Census and Citibike Station data
# Reproject census polygons (NAD83 / New York Long Island ftUS) into the
# station layer's geodetic CRS (WGS84, EPSG:4326) so the layers align
census_joined <- st_transform(census_joined, crs=st_crs(station_jan_sf))
# Plot interim data: census tract outlines with station bubbles on top.
# Use census_joined (just reprojected to match the station layer) rather than
# the raw `census` object, so both layers share the same CRS instead of
# relying on tmap's on-the-fly reprojection of the ftUS polygons.
tm_shape(census_joined) +
tm_borders() +
tm_shape(station_jan_sf) +
tm_bubbles(size=0.1, col="ride_activity", palette="YlOrRd") +
tm_layout(main.title = "Citibike Jan 2019 and Census Polygons",
main.title.size=1, frame=FALSE, legend.position= c("left", "top"))
# Creating an aggregated station dataset at census tract level
station_jan_nyc <- station_jan_sf %>%
# Inner spatial join (left=FALSE) drops stations outside NYC, i.e. points
# that fall within no census tract polygon
st_join(census_joined, join= st_within, left=FALSE)
ct_station_jan <- station_jan_nyc %>%
group_by(BoroCT2020) %>%
summarise(
num_stations = n(),
total_ride_start_count = sum(ride_start_count, na.rm=TRUE),
total_ride_end_count = sum(ride_end_count, na.rm=TRUE),
total_ride_activity = sum(ride_activity, na.rm=TRUE),
# Median of the per-station medians, not a tract-wide trip-level median
median_trip_duration_start = median(median_trip_duration_start, na.rm=TRUE),
median_trip_duration_end = median(median_trip_duration_end, na.rm=TRUE),
usertype_subscriber_count = sum(usertype_subscriber_count, na.rm=TRUE),
usertype_customer_count = sum(usertype_customer_count, na.rm=TRUE),
# Adding in the time factor
weekday_7_10 = sum(ride_start_weekday_7_10, na.rm=TRUE),
weekday_10_17 = sum(ride_start_weekday_10_17, na.rm=TRUE),
weekday_17_0 = sum(ride_start_weekday_17_0, na.rm=TRUE),
weekday_0_7 = sum(ride_start_weekday_0_7, na.rm=TRUE),
weekend_10_0 = sum(ride_start_weekend_10_0, na.rm=TRUE),
weekend_0_10 = sum(ride_start_weekend_0_10, na.rm=TRUE)
)
# Converting to df so the attribute inner join below is allowed (sf blocks
# joins between two sf objects) and polygons without Citibike docks are dropped
ct_station_jan_df <- as.data.frame(ct_station_jan)
agg_citibike_jan <- census_joined %>%
inner_join(ct_station_jan_df, by= "BoroCT2020")
# colnames(agg_citibike_jan)
# agg_citibike_jan <- agg_citibike_jan_copy
# List of population columns to convert from character to numeric
columns_to_convert <- c("Pop1", "PopU5", "Pop5t9", "Pop10t14", "Pop15t19", "Pop20t24", "Pop25t29", "Pop30t34", "Pop35t39", "PopU18", "Pop65pl", "GQClgHsg", "Fam", "HUnits")
# Values contain thousands separators (e.g. "3,512") which as.numeric() would
# turn into NA, so the commas are stripped first.
# Fix: subsetting an sf object by column names silently appends the sticky
# geometry column, so lapply() previously received 15 columns for the 14
# assignment targets ("provided 15 variables to replace 14 variables" warning).
# Dropping the geometry before subsetting keeps the columns aligned.
agg_citibike_jan[columns_to_convert] <- lapply(st_drop_geometry(agg_citibike_jan)[columns_to_convert], function(x) {
x <- gsub(",", "", x) # Remove commas
x <- as.numeric(x) # Convert to numeric
x[is.na(x)] <- 0 # Replace NAs with 0
return(x)
})
# Replace NA with 0 for all numeric and integer columns.
# Fix: detect and subset numeric columns on a geometry-free copy, so the
# sticky sf geometry column is not pulled into the subset (it caused the
# "provided 48 variables to replace 47 variables" warning before).
numeric_cols <- names(which(vapply(st_drop_geometry(agg_citibike_jan), is.numeric, logical(1))))
agg_citibike_jan[numeric_cols] <-
lapply(st_drop_geometry(agg_citibike_jan)[numeric_cols], function(x) {
x[is.na(x)] <- 0
return(x)
})
# Verify if there are any remaining NAs
sum(is.na(agg_citibike_jan))
## [1] 416
# NOTE(review): 416 NAs remain after the numeric replacement above --
# presumably in non-numeric (character) columns; confirm which columns
# they sit in using the commented check below.
# Create a new working-age column 'Pop19t64' as total population minus
# under-18s and 65-plus: Pop1 - PopU18 - Pop65pl
agg_citibike_jan$Pop19t64 <- agg_citibike_jan$Pop1 - agg_citibike_jan$PopU18 - agg_citibike_jan$Pop65pl
# Verify the result
head(agg_citibike_jan[, c("Pop1", "PopU18", "Pop65pl", "Pop19t64")])
## Simple feature collection with 6 features and 4 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.99481 ymin: 40.70913 xmax: -73.9733 ymax: 40.72943
## Geodetic CRS: WGS 84
## Pop1 PopU18 Pop65pl Pop19t64 geometry.x
## 1 11616 1723 3019 6874 MULTIPOLYGON (((-73.99022 4...
## 2 3543 679 784 2080 MULTIPOLYGON (((-73.98837 4...
## 3 7934 845 1201 5888 MULTIPOLYGON (((-73.98985 4...
## 4 6969 878 1333 4758 MULTIPOLYGON (((-73.97875 4...
## 5 3609 489 488 2632 MULTIPOLYGON (((-73.97689 4...
## 6 6819 829 1342 4648 MULTIPOLYGON (((-73.9733 40...
# # Verify if there are any remaining NAs and list the columns with NAs
# remaining_na_cols <- sapply(agg_citibike_jan, function(x) any(is.na(x)))
#
# # Print out the columns with remaining NAs
# if (any(remaining_na_cols)) {
# cat("Columns with remaining NAs:\n")
# print(names(remaining_na_cols[remaining_na_cols == TRUE]))
# } else {
# cat("No remaining NAs in the numeric columns.\n")
# }
# Choropleth of total ride start counts by census tract, with all NYC tracts
# drawn in light grey for geographic context
tm_shape(census_joined)+
tm_borders(col= "white")+
tm_fill(col= "lightgrey")+
tm_shape(agg_citibike_jan)+
tm_polygons(col= "total_ride_start_count",
style= "quantile",
palette= "YlOrRd",
title= "Total Ride Start Count")+
tm_layout(title= "Citibike Ride Activity by Census Tract (Jan 2019)",
title.position= c("left", "top"),
legend.position= c("left", "top"),
legend.outside= FALSE,
frame= FALSE)
Summarise the properties of your data
# Choropleth of median trip duration (start stations) by census tract,
# with the same light-grey NYC backdrop for context
tm_shape(census_joined)+
tm_borders(col= "white")+
tm_fill(col= "lightgrey")+
tm_shape(agg_citibike_jan)+
tm_polygons(col= "median_trip_duration_start",
style= "quantile",
palette= "RdPu",
title= "Median Trip Duration")+
tm_layout(title= "Citibike Ride Activity by Census Tract (Jan 2019)",
title.position= c("left", "top"),
legend.position= c("left", "top"),
legend.outside= FALSE,
frame= FALSE)
# Plot histogram of total ride activity across tracts
ggplot(data=agg_citibike_jan, aes(total_ride_activity)) + geom_histogram()
#####Spatial Autocorrelation Test
Global Moran’s I
library(spdep)
library(knitr)
# Contiguity neighbours derived from the tract polygons
nb <- poly2nb(agg_citibike_jan)
## Warning in poly2nb(agg_citibike_jan): some observations have no neighbours;
## if this seems unexpected, try increasing the snap argument.
## Warning in poly2nb(agg_citibike_jan): neighbour object has 6 sub-graphs;
## if this sub-graph count seems unexpected, try increasing the snap argument.
# Row-standardised weights matrix; zero.policy tolerates no-neighbour tracts
W <- nb2mat(nb, style='W', zero.policy = TRUE)
colnames(W) <- rownames(W)
# Creating a weights list (listw form used by the spdep test functions)
Wl <- nb2listw(nb, zero.policy = TRUE)
# Global Moran's I statistic for tract-level ride start counts
moran(agg_citibike_jan$total_ride_start_count, Wl, n=length(Wl$neighbours), S0=Szero(Wl))
## $I
## [1] 0.4743091
##
## $K
## [1] 11.69548
# Significance test for Moran's I under the randomisation assumption
moran.test(agg_citibike_jan$total_ride_start_count, Wl)
##
## Moran I test under randomisation
##
## data: agg_citibike_jan$total_ride_start_count
## weights: Wl
## n reduced by no-neighbour observations
##
## Moran I statistic standard deviate = 14.844, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Moran I statistic Expectation Variance
## 0.469748398 -0.002433090 0.001011878
# Significance test via Monte-Carlo (permutation) simulation, 999 draws
moran.mc(agg_citibike_jan$total_ride_start_count, Wl, nsim=999)
##
## Monte-Carlo simulation of Moran I
##
## data: agg_citibike_jan$total_ride_start_count
## weights: Wl
## number of simulations + 1: 1000
##
## statistic = 0.46975, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
At the census tract level, total ride start counts display significant, positive autocorrelation
Moran’s Scatterplot
# Moran scatterplot: variable against its spatial lag, points labelled with
# the neighbourhood (NTA) name to identify outliers
moran.plot(agg_citibike_jan$total_ride_start_count, Wl, xlab='Total Ride Start Count', ylab='Spatially Lagged Ride Start Count', labels=agg_citibike_jan$NTAName)
[add in finding - Central Park, most values in low-low]
Gi and Gi*
# Gi and Gi*
# Distance band of 1.6 km around tract centroids, based roughly on the width
# of 2 Manhattan blocks - also see the comparison of 4km vs 2km in the
# commented block below. (dnearneigh bounds are in km for lon/lat input.)
NbrL <- dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6)
## Warning: st_centroid assumes attributes are constant over geometries
## Warning in dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6): neighbour object
## has 2 sub-graphs
# Creating a list of neighbours not including self (for Gi)
D <- nb2listw(NbrL, style='B')
# Creating a list of neighbours including self (for Gi*)
D_star <- nb2listw(include.self(NbrL), style='B')
# Fix: the original code assigned both global G tests to `G`, so the first
# (Gi) result was immediately overwritten and lost; keep them separate.
G <- globalG.test(agg_citibike_jan$total_ride_start_count, D)
G_star <- globalG.test(agg_citibike_jan$total_ride_start_count, D_star)
# Check for isolated observations by returning each polygon's neighbour count;
# lengths() is the type-stable replacement for sapply(NbrL, length)
lengths(NbrL)
## [1] 29 31 39 33 32 32 29 39 28 37 45 48 26 23 41 45 39 44 41 42 41 42 47 44 40
## [26] 38 32 31 37 33 33 43 31 32 34 37 41 28 42 38 28 39 36 29 40 42 41 33 24 32
## [51] 40 38 39 32 41 40 37 31 43 29 28 43 28 35 33 33 42 31 33 33 40 32 26 30 22
## [76] 24 29 27 28 27 29 24 23 27 25 23 27 21 26 17 28 21 31 17 21 26 21 32 34 18
## [101] 26 23 19 18 21 22 22 17 19 34 33 26 22 7 9 6 12 13 11 17 21 21 24 25 27
## [126] 25 21 23 28 27 23 28 30 28 23 21 20 18 15 16 16 13 18 20 15 20 11 10 3 42
## [151] 20 41 33 39 20 25 29 30 35 31 20 9 6 10 18 10 23 20 20 25 38 31 30 27 31
## [176] 14 29 18 22 29 28 27 27 31 16 14 40 45 47 42 28 30 35 36 29 27 25 28 29 28
## [201] 30 32 34 41 28 24 17 15 25 25 28 12 32 43 40 20 29 19 16 22 17 18 24 20 22
## [226] 33 18 19 26 26 26 28 26 38 40 37 41 40 40 43 45 38 30 24 6 25 27 37 36 44
## [251] 13 36 26 45 44 40 40 35 29 22 17 15 15 22 26 31 37 36 37 37 38 35 31 30 36
## [276] 40 39 35 36 34 32 25 16 21 18 25 31 35 36 31 31 32 30 33 30 28 29 30 27 28
## [301] 29 28 23 13 15 19 24 26 28 25 20 10 27 10 13 19 22 23 23 24 28 25 26 28 29
## [326] 28 25 26 25 25 29 30 20 23 28 27 15 21 16 18 31 31 36 29 24 19 28 31 21 22
## [351] 14 25 16 30 24 25 24 18 27 27 8 19 29 27 17 17 30 35 24 31 31 12 31 14 10
## [376] 24 32 33 25 17 22 25 26 13 13 32 30 24 21 14 31 35 11 11 12 25 20 18 24 27
## [401] 25 28 14 21 18 18 19 17 16 1 45 44 32 33 33 34
# > Compare to: sapply(NbrL, length) when NbrL=4
# [1] 163 166 155 170 167 174 124 166 116 170 153 154 111 105 167 161 139 143 133 154 146 136 144 151 137 133
# [27] 120 174 131 167 122 151 123 170 166 137 161 126 154 142 127 142 169 130 163 145 159 131 115 167 145 161
# [53] 156 170 144 154 163 165 148 166 167 149 163 136 130 160 150 165 133 161 148 133 132 147 119 128 154 150
# [79] 147 142 139 132 131 122 127 130 125 129 115 115 109 123 107 112 118 104 110 89 91 94 88 79 78 66
# [105] 68 70 66 57 57 87 80 71 62 15 15 15 15 15 15 155 107 116 118 111 106 101 95 90 85 84
# [131] 91 93 96 105 106 129 89 83 79 74 65 56 64 67 59 80 98 98 15 159 122 163 169 164 98 90
# [157] 164 158 151 171 78 15 15 15 113 15 76 162 174 96 148 133 130 177 169 67 174 168 114 161 156 153
# [183] 78 84 115 110 161 143 147 157 168 166 126 132 166 166 164 164 162 160 152 163 167 160 95 94 107 99
# [209] 69 76 77 15 168 155 159 110 136 113 134 137 161 135 114 143 119 130 119 115 151 138 140 146 127 162
# [235] 163 163 153 140 150 137 140 139 125 107 66 93 106 128 118 125 77 103 88 128 128 129 117 105 98 92
# [261] 88 84 87 96 100 104 111 124 121 139 149 158 170 166 149 145 131 129 120 117 112 174 97 104 97 105
# [287] 121 127 134 132 124 115 111 118 124 130 117 112 111 112 111 107 98 79 83 89 93 97 96 104 87 68
# [313] 115 86 91 98 95 96 105 112 119 112 119 127 125 139 138 142 159 160 142 144 173 173 157 155 158 114
# [339] 147 99 141 144 109 96 122 113 178 177 94 95 84 130 97 162 116 115 178 83 177 172 81 125 149 172
# [365] 164 142 166 160 125 138 133 86 173 58 69 115 130 134 138 84 112 124 123 15 15 128 120 116 107 83
# [391] 173 176 15 15 96 175 86 73 168 166 118 120 105 111 86 83 143 151 92 40 155 146 172 174 144 153
# Local G hot/cold-spot statistic using the distance-band weights (self excluded)
Gi <- localG(agg_citibike_jan$total_ride_start_count, D)
agg_citibike_jan$Gi <- Gi
# Gi* additionally includes each tract itself in its own neighbourhood
Gi_star <- localG(agg_citibike_jan$total_ride_start_count, D_star)
agg_citibike_jan$Gi_star <- Gi_star
tm_shape(agg_citibike_jan) + tm_polygons(col='Gi', palette='-RdBu', style='quantile')
tm_shape(agg_citibike_jan) + tm_polygons(col='Gi_star', palette='-RdBu', style='quantile')
[add in finding - Midtown Manhattan clusters]
Local Moran’s I
# Local Moran's I for each tract (contiguity weights)
Ii <- localmoran(agg_citibike_jan$total_ride_start_count, Wl)
agg_citibike_jan$Ii <- Ii[,'Ii']
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii', palette='-RdBu', style='quantile')
# Unadjusted two-sided p-values for the local statistics
agg_citibike_jan$Iip_unadjusted <- Ii[,'Pr(z != E(Ii))']
agg_citibike_jan$Ii_un_sig <- 'nonsignificant'
agg_citibike_jan$Ii_un_sig[which(agg_citibike_jan$Iip_unadjusted < 0.05)] <- 'significant'
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_un_sig', palette='-RdBu')
# Bonferroni correction for multiple testing across the many tracts
agg_citibike_jan$Iip_adjusted <- p.adjust(agg_citibike_jan$Iip_unadjusted, method='bonferroni')
agg_citibike_jan$Ii_ad_sig <- 'nonsignificant'
agg_citibike_jan$Ii_ad_sig[which(agg_citibike_jan$Iip_adjusted < 0.05)] <- 'significant'
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_ad_sig', palette='-RdBu')
# Looking for clusters from local Moran
# Classify polygons into local Moran cluster types (HH/HL/LL/LH).
# Code adapted from https://rpubs.com/Hailstone/346625
#
# Args:
#   shape: sf polygon layer containing the variable of interest
#   W: spatial weights (listw) used for both localmoran() and the spatial lag
#   var: name (string) of the column to analyse
#   alpha: significance threshold applied to the adjusted p-values
#   p.adjust.method: multiple-testing correction passed to p.adjust()
# Returns: list with a `results` data.frame holding Ii, adjusted p-value,
#   cluster type, and the significance-masked cluster label
moranCluster <- function(shape, W, var, alpha=0.05, p.adjust.method='bonferroni')
{
  Ii <- localmoran(shape[[var]], W)
  shape$Ii <- Ii[,"Ii"]
  Iip <- p.adjust(Ii[,"Pr(z != E(Ii))"], method=p.adjust.method)
  shape$Iip <- Iip
  shape$sig <- shape$Iip < alpha
  # Scale the data to obtain low and high values
  shape$scaled <- scale(shape[[var]])           # high/low values at location i
  # Fix: use the W argument rather than the global `Wl` the function
  # previously captured, so the supplied weights are actually honoured
  shape$lag_scaled <- lag.listw(W, shape$scaled) # high/low values at neighbours j
  # Fix: the final branch previously repeated the LL condition
  # (scaled<0 & lag_scaled<0), which made the "LH" category unreachable;
  # it now tests scaled<0 & lag_scaled>0 as intended
  shape$lag_cat <- factor(ifelse(shape$scaled>0 & shape$lag_scaled>0, "HH",
                          ifelse(shape$scaled>0 & shape$lag_scaled<0, "HL",
                          ifelse(shape$scaled<0 & shape$lag_scaled<0, "LL",
                          ifelse(shape$scaled<0 & shape$lag_scaled>0, "LH", "Equivalent")))))
  shape$sig_cluster <- as.character(shape$lag_cat)
  # Mask non-significant polygons so only robust clusters remain labelled
  shape$sig_cluster[!shape$sig] <- "Non-sig"
  shape$sig_cluster <- as.factor(shape$sig_cluster)
  results <- data.frame(Ii=shape$Ii, pvalue=shape$Iip, type=shape$lag_cat, sig=shape$sig_cluster)
  return(list(results=results))
}
# Run the cluster classification on ride start counts and map the result
clusters <- moranCluster(agg_citibike_jan, W=Wl, var='total_ride_start_count')$results
agg_citibike_jan$Ii_cluster <- clusters$sig
tm_shape(agg_citibike_jan) + tm_polygons(col='Ii_cluster')
Describe the method you are using to analyse the data
OLS Linear Model
# Baseline OLS model: ride starts as a function of tract demographics,
# housing, tract area and dock supply; used to test whether spatial
# autocorrelation remains in the regression errors
janbike.lm <- lm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan)
# Saving the residuals as a column
agg_citibike_jan$lm.res <- residuals(janbike.lm)
# Plotting the residuals
tm_shape(agg_citibike_jan)+tm_polygons('lm.res', palette='-RdBu', style='quantile')
# # Testing for residual autocorrelation
# # Created a weights list for the Global Moran's I calculation earlier
# nb <- poly2nb(agg_citibike_jan)
# Wl <- nb2listw(nb, zero.policy = TRUE)
#
# # Created a weights list based off distance for the Gi and Gi* calculation earlier
# NbrL <- dnearneigh(st_centroid(agg_citibike_jan), 0, 1.6)
# D <- nb2listw(NbrL, style='B')
# Global Moran's I test on the OLS residuals (distance-band weights D)
lm.morantest(janbike.lm, D)
##
## Global Moran I for regression residuals
##
## data:
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## weights: D
##
## Moran I statistic standard deviate = 28.714, p-value < 2.2e-16
## alternative hypothesis: greater
## sample estimates:
## Observed Moran I Expectation Variance
## 0.3328527652 -0.0060070098 0.0001392648
# Robust Lagrange-multiplier test for an omitted spatial lag term.
# NOTE(review): D uses binary weights (style='B'), hence the "not row
# standardized" warning -- consider a row-standardised listw here.
lm.LMtests(janbike.lm, D, test='RLMlag')
## Warning in lm.RStests(model = model, listw = listw, zero.policy = zero.policy,
## : Spatial weights matrix not row standardized
##
## Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
## dependence
##
## data:
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## test weights: listw
##
## adjRSlag = 158.17, df = 1, p-value < 2.2e-16
# Robust Lagrange-multiplier test for spatially correlated errors
lm.LMtests(janbike.lm, D, test='RLMerr')
## Warning in lm.RStests(model = model, listw = listw, zero.policy = zero.policy,
## : Spatial weights matrix not row standardized
##
## Rao's score (a.k.a Lagrange multiplier) diagnostics for spatial
## dependence
##
## data:
## model: lm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan)
## test weights: listw
##
## adjRSerr = 322.01, df = 1, p-value < 2.2e-16
Spatial Lag Regression
# Load regression libraries
library(spgwr)
library(spatialreg)
# Spatial lag model (SAR): adds the spatially lagged dependent variable.
# NOTE(review): fitted with the binary distance-band weights D; a
# row-standardised listw is the more conventional choice for lag models --
# confirm this is deliberate (it also affects the scale of Rho).
janbike.lag <- lagsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan, listw=D)
## Warning in lagsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 2.39674e-20 - using numerical Hessian.
## Warning in sqrt(diag(fdHess)[-1]): NaNs produced
summary(janbike.lag)
##
## Call:lagsarlm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan, listw = D)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3681.96 -670.06 37.82 635.87 9969.37
##
## Type: lag
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.5150e+03 5.9088e+02 -4.2563 2.078e-05
## PopU18 9.1995e-02 4.4801e-01 0.2053 0.837307
## Pop19t64 -1.3586e-01 8.8707e-02 -1.5315 0.125640
## Pop65pl -4.2706e-01 2.7530e-01 -1.5513 0.120837
## HUnits 5.4746e-01 1.8049e-01 3.0331 0.002421
## Fam -2.8655e-01 6.9216e-01 -0.4140 0.678876
## AvgHHSz 1.8653e+02 2.4386e+02 0.7649 0.444325
## GQClgHsg 1.4038e-02 NaN NaN NaN
## Shape_Area 3.9809e-05 3.7186e-05 1.0705 0.284377
## num_stations 1.2015e+03 7.4877e+01 16.0469 < 2.2e-16
##
## Rho: 0.0227, LR test value: 247.51, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.0013363
## z-value: 16.987, p-value: < 2.22e-16
## Wald statistic: 288.57, p-value: < 2.22e-16
##
## Log likelihood: -3612.249 for lag model
## ML residual variance (sigma squared): 1995200, (sigma: 1412.5)
## Number of observations: 416
## Number of parameters estimated: 12
## AIC: 7248.5, (AIC for lm: 7494)
# Save and map the lag-model residuals
agg_citibike_jan$lag.res <- residuals(janbike.lag)
tm_shape(agg_citibike_jan) + tm_polygons('lag.res', palette='-RdBu', style='quantile')
Spatial Error Regression
# Spatial error model (SEM): spatial dependence enters via the error term
# rather than a lagged response; same covariates and weights as the lag model
janbike.err <- errorsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area + num_stations, data=agg_citibike_jan, listw=D)
## Warning in errorsarlm(total_ride_start_count ~ PopU18 + Pop19t64 + Pop65pl + : inversion of asymptotic covariance matrix failed for tol.solve = 2.22044604925031e-16
## reciprocal condition number = 3.77442e-20 - using numerical Hessian.
summary(janbike.err)
##
## Call:errorsarlm(formula = total_ride_start_count ~ PopU18 + Pop19t64 +
## Pop65pl + HUnits + Fam + AvgHHSz + GQClgHsg + Shape_Area +
## num_stations, data = agg_citibike_jan, listw = D)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3636.986 -675.220 99.102 545.406 9844.031
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1134e+03 5.2780e+02 -4.0041 6.225e-05
## PopU18 -1.4544e-01 3.4930e-01 -0.4164 0.67713
## Pop19t64 -2.0265e-01 2.0792e-01 -0.9746 0.32973
## Pop65pl -5.3638e-01 3.2736e-01 -1.6385 0.10132
## HUnits 4.8465e-01 2.5749e-01 1.8822 0.05981
## Fam 3.3913e-01 6.3178e-01 0.5368 0.59141
## AvgHHSz 2.1974e+02 2.1829e+02 1.0066 0.31412
## GQClgHsg 3.1507e-02 2.8754e-01 0.1096 0.91275
## Shape_Area 3.1857e-05 3.6138e-05 0.8815 0.37803
## num_stations 1.2217e+03 7.3690e+01 16.5793 < 2.2e-16
##
## Lambda: 0.026931, LR test value: 245.47, p-value: < 2.22e-16
## Approximate (numerical Hessian) standard error: 0.00067988
## z-value: 39.611, p-value: < 2.22e-16
## Wald statistic: 1569.1, p-value: < 2.22e-16
##
## Log likelihood: -3613.266 for error model
## ML residual variance (sigma squared): 1967600, (sigma: 1402.7)
## Number of observations: 416
## Number of parameters estimated: 12
## AIC: 7250.5, (AIC for lm: 7494)
# Save and map the error-model residuals
agg_citibike_jan$err.res <- residuals(janbike.err)
# Fix: the model was fitted on the raw count scale (no log transform of the
# response anywhere in this script), so exponentiating the fitted values was
# incorrect; store them untransformed.
agg_citibike_jan$err.fit <- fitted.values(janbike.err)
tm_shape(agg_citibike_jan) + tm_polygons('err.res', palette='-RdBu', style='quantile')
Present the results of your individual section
Combine and discuss results here (e.g. for final site selection, or to discuss the relative merits of approaches). Include limitations and future work
+++++++
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.